1 Setup

This section and the next are relevant for reproducibility purposes. For results, please skip to section 3 (Quality Control)

  • Libraries
suppressPackageStartupMessages({
  library(data.table)
  library(DESeq2)
  library(gplots)
  library(here)
  library(hyperSpec)
  library(magrittr)
  library(parallel)
  library(plotly)
  library(pvclust)
  library(tidyverse)
  library(tximport)
  library(vsn)
  library(emoji)
})
  • Helper functions
source(here("UPSCb-common/src/R/featureSelection.R"))
  • Graphics
hpal <- colorRampPalette(c("blue","white","red"))(100)

2 Data

  • Sample information
samples <- read_csv(here("data/drought_roots.csv"),
                      col_types=cols(.default=col_factor()))
  • tx2gene translation table
tx2gene <- suppressMessages(read_delim(here("reference/annotation/tx2gene.tsv.gz"), delim="\t", col_names=c("TXID","GENE"), skip=1))
  • Raw data
filelist <- list.files(here("results/Salmon"), 
                          recursive = TRUE, 
                          pattern = "quant.sf",
                          full.names = TRUE)
  • Sanity check to ensure that the data is sorted according to the sample info
stopifnot(all(match(sub("[1-3]_151118_BC852HANXX_P2503_", "", sub("*_sortmerna_trimmomatic","",basename(dirname(filelist)))),
                    samples$SciLifeID) == 1:nrow(samples)))

samples_rep <-rbind(samples,samples)
samples_rep <- rbind(samples_rep,samples)
  • add filelist to samples as a new column
samples_rep %<>% mutate(Filenames = filelist)
samples_rep$BioID <- sub("[1-3]_151118_BC852HANXX_", "", sub("*_sortmerna_trimmomatic","",basename(dirname(samples_rep$Filenames))))
  • Read the expression at the gene level
txi <- suppressMessages(tximport(files = samples_rep$Filenames,
                                 type = "salmon",
                                 tx2gene=tx2gene))
counts <- txi$counts

Combine technical replicates

counts <- do.call(
  cbind,
  lapply(split.data.frame(t(counts),
                          samples$SciLifeID),
         colSums))

csamples <- samples[,-1]
csamples <- csamples[match(colnames(counts),csamples$SciLifeID),]
colnames(counts) <- csamples$SciLifeID

 

3 Quality Control

3.1 “Not expressed” genes

sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
        round(sum(sel) * 100/ nrow(counts),digits=1),
        sum(sel),
        nrow(counts))
## [1] "8.1% percent (4306) of 53142 genes are not expressed"

3.2 Sequencing depth

  • Let us take a look at the sequencing depth, colouring by Level
dat <- tibble(x=colnames(counts),y=colSums(counts)) %>% 
  bind_cols(csamples)

ggplot(dat,aes(x,y,fill=Level)) + 
  geom_col() + 
  scale_y_continuous(name="reads") +
  facet_grid(~ factor(Level), scales = "free") +
  theme_bw() + 
  theme(axis.text.x=element_text(angle=90,size=4), axis.title.x=element_blank()) +
  labs(title ="Sample sequencing depth")

👉 We observe not a big difference in the raw sequencing depth

3.3 Per-gene mean expression

i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.

ggplot(data.frame(value=log10(rowMeans(counts))),aes(x=value)) + 
  geom_density(na.rm = TRUE) +
  ggtitle("Gene mean raw counts distribution") +
  scale_x_continuous(name="mean raw counts (log10)") + 
  theme_bw()

👉 The cumulative gene coverage is as expected since the highest peak is around 2 even if there is a lot of noise around 0

3.4 Per-sample expression

dat <- as.data.frame(log10(counts)) %>% 
  utils::stack() %>% 
  mutate(Level=samples_rep$Level[match(ind,csamples$SciLifeID)]) 


ggplot(dat,aes(x=values,group=ind,col=Level)) + 
  geom_density(na.rm = TRUE) + 
  ggtitle("Sample raw counts distribution") +
  scale_x_continuous(name="per gene raw counts (log10)") + 
  theme_bw()

👉 All samples have the same sequencing depth characteristics and there is no deviation when we look at one or the other variable

  • Export raw expression data
dir.create(here("data/analysis/salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file=here("data/analysis/salmon/raw-unormalised-gene-expression_data_merge.csv"))

 

4 Data normalisation

4.1 Preparation

For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample condition and replicate.

load(file=here("data/analysis/salmon/dds_merge.rda"))

4.2 Size factors

(i.e. the sequencing library size effect)

dds <- estimateSizeFactors(dds) %>% 
  suppressMessages()

boxplot(normalizationFactors(dds),
        main="Sequencing libraries size factor",
        las=2,log="y")
abline(h=1, col = "Red", lty = 3)

and without outliers:

boxplot(normalizationFactors(dds),
        main="Sequencing libraries size factor",
        las=2,log="y", outline=FALSE)
abline(h=1, col = "Red", lty = 3)

👉 The sequencing libraries size factor is highly variable across samples but similar for the technical replicates

4.3 Validation

Let’s look at standard deviations before and after VST normalization. This plot is to see whether there is a dependency of SD on the mean.

Before:

meanSdPlot(log1p(counts(dds))[rowSums(counts(dds))>0,])

After VST normalization, the red line should be almost horizontal which indicates no dependency of variance on mean (homoscedastic).

4.4 Variance Stabilising Transformation

vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)

meanSdPlot(vst[rowSums(vst)>0,])

👉 We can conclude that the variance stabilization worked adequately even if the red line is not perfectly horizontal


 

4.5 QC on the normalised data

4.5.1 PCA

pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2,]*100)

4.5.2 Scree plot

We define the number of variables of the model (=1) and the number of possible combinations (=8).

We plot the percentage explained by different components

  • the red line represents number of variables in the model
  • the orange line represents number of variable combinations.
nvar=1
nlevel=nlevels(dds$Level)
ggplot(tibble(x=1:length(percent),y=cumsum(percent)),aes(x=x,y=y)) +
  geom_line() + scale_y_continuous("variance explained (%)",limits=c(0,100)) +
  scale_x_continuous("Principal component") + 
  geom_vline(xintercept=nvar,colour="red",linetype="dashed",size=0.5) + 
  geom_hline(yintercept=cumsum(percent)[nvar],colour="red",linetype="dashed",size=0.5) +
  geom_vline(xintercept=nlevel,colour="orange",linetype="dashed",size=0.5) + 
  geom_hline(yintercept=cumsum(percent)[nlevel],colour="orange",linetype="dashed",size=0.5)+
  ggtitle("Percentage of variance explained by different components")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.

4.5.3 PCA plot

dds$Filenames <- sub("*_151118_BC852HANXX_P2503_", "_", sub("*_sortmerna_trimmomatic","",basename(dirname(dds$Filenames))))
pc.dat <- bind_cols(PC1=pc$x[,1],
                    PC2=pc$x[,2],
                    as.data.frame(colData(dds)))

p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Level,shape=Level,text=Filenames)) + 
  geom_point(size=2) + 
  ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")

ggplotly(p) %>% 
  layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
         yaxis=list(title=paste("PC2 (",percent[2],"%)",sep="")))
## Warning: The shape palette can deal with a maximum of 6 discrete values because
## more than 6 becomes difficult to discriminate; you have 8. Consider
## specifying shapes manually if you must have them.

👉 Some conditions clusterized better than others in the PCA plot

4.6 Sequencing depth

Number of genes expressed per condition at different cutoffs:

#conds <- factor(dds$Level)
conds <- factor(dds$Level)
dev.null <- rangeSamplesSummary(counts=vst,
                                conditions=conds,
                                nrep=3)

4.7 Heatmap

Filter for noise

sels <- rangeFeatureSelect(counts=vst,
                           conditions=conds,
                           nrep=3) %>% 
  suppressWarnings()

vst.cutoff <- 2
abline(h=10000, col="Red", lty=3)

  • Heatmap of “all” genes
hm <- heatmap.2(t(scale(t(vst[sels[[vst.cutoff+1]],]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = conds,
          col=hpal)

plot(as.hclust(hm$colDendrogram),xlab="",sub="")

  • Set the cut off to 8 in order to retrieve less than 10 000 genes
vst.cutoff <- 8

hm_reduced <- heatmap.2(t(scale(t(vst[sels[[vst.cutoff+1]],]))),
                distfun=pearson.dist,
                hclustfun=function(X){hclust(X,method="ward.D2")},
                labRow = NA,trace = "none",
                labCol = conds,
                col=hpal)

plot(as.hclust(hm_reduced$colDendrogram),xlab="",sub="")

👉 The different conditions are not so different in gene expression level

4.8 Clustering of samples

Done to assess the previous dendrogram’s reproducibility

hm.pvclust <- pvclust(data = t(scale(t(vst[sels[[vst.cutoff+1]],]))),
                       method.hclust = "ward.D2", 
                       nboot = 1000, parallel = TRUE)
## Creating a temporary cluster...done:
## socket cluster with 15 nodes on host 'localhost'
## Multiscale bootstrap... Done.

Plot the clustering with bp and au

plot(hm.pvclust, labels = conds, cex.axis=1.5)
pvrect(hm.pvclust)

👉 Some conditions clusterize better than others

bootstrapping results as a table
print(hm.pvclust, digits=3)
## 
## Cluster method: ward.D2
## Distance      : correlation
## 
## Estimates on edges:
## 
##       si    au    bp se.si se.au se.bp      v      c  pchi
## 1  0.768 0.890 0.864 0.029 0.018 0.004 -1.162  0.065 0.136
## 2  0.817 0.931 0.807 0.023 0.012 0.004 -1.173  0.307 0.788
## 3  1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 4  1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 5  1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 6  1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 7  0.832 0.911 0.935 0.028 0.018 0.003 -1.428 -0.083 0.747
## 8  0.804 0.890 0.936 0.031 0.022 0.003 -1.377 -0.148 0.730
## 9  0.573 0.786 0.789 0.036 0.025 0.004 -0.797 -0.006 0.232
## 10 0.990 0.994 0.998 0.014 0.009 0.001 -2.692 -0.151 0.902
## 11 1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 12 1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 13 1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 14 0.786 0.861 0.968 0.041 0.032 0.002 -1.470 -0.386 0.828
## 15 0.681 0.847 0.822 0.033 0.021 0.004 -0.974  0.049 0.630
## 16 0.683 0.860 0.786 0.031 0.019 0.004 -0.936  0.142 0.932
## 17 0.085 0.578 0.501 0.037 0.030 0.005 -0.099  0.097 0.699
## 18 0.663 0.899 0.591 0.032 0.014 0.005 -0.752  0.523 0.806
## 19 0.047 0.593 0.448 0.038 0.030 0.005 -0.052  0.182 0.232
## 20 0.364 0.868 0.304 0.051 0.018 0.005 -0.302  0.814 0.610
## 21 1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 22 0.053 0.852 0.164 0.076 0.022 0.004 -0.035  1.012 0.662
## 23 0.527 0.718 0.856 0.040 0.032 0.004 -0.819 -0.242 0.184
## 24 0.527 0.718 0.856 0.040 0.032 0.004 -0.819 -0.242 0.184
## 25 0.527 0.718 0.856 0.040 0.032 0.004 -0.819 -0.242 0.184
## 26 1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 27 1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000

 

5 Summary

The data are quite good so we can continue with our analysis

6 Session Info

Session Info
sessionInfo()
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.6 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
##  [1] parallel  grid      stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] emoji_15.0                  vsn_3.64.0                 
##  [3] tximport_1.24.0             forcats_0.5.2              
##  [5] stringr_1.4.1               dplyr_1.0.10               
##  [7] purrr_0.3.5                 readr_2.1.3                
##  [9] tidyr_1.2.1                 tibble_3.1.8               
## [11] tidyverse_1.3.2             pvclust_2.2-0              
## [13] plotly_4.10.1               magrittr_2.0.3             
## [15] hyperSpec_0.100.0           xml2_1.3.3                 
## [17] ggplot2_3.4.0               lattice_0.20-45            
## [19] here_1.0.1                  gplots_3.1.3               
## [21] DESeq2_1.36.0               SummarizedExperiment_1.26.1
## [23] Biobase_2.56.0              MatrixGenerics_1.8.1       
## [25] matrixStats_0.62.0          GenomicRanges_1.48.0       
## [27] GenomeInfoDb_1.32.4         IRanges_2.30.1             
## [29] S4Vectors_0.34.0            BiocGenerics_0.42.0        
## [31] data.table_1.14.4          
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.4.1           backports_1.4.1        lazyeval_0.2.2        
##   [4] splines_4.2.0          crosstalk_1.2.0        BiocParallel_1.30.4   
##   [7] digest_0.6.30          htmltools_0.5.3        fansi_1.0.3           
##  [10] memoise_2.0.1          googlesheets4_1.0.1    limma_3.52.4          
##  [13] tzdb_0.3.0             Biostrings_2.64.1      annotate_1.74.0       
##  [16] modelr_0.1.9           vroom_1.6.0            timechange_0.1.1      
##  [19] jpeg_0.1-9             colorspace_2.0-3       blob_1.2.3            
##  [22] rvest_1.0.3            haven_2.5.1            xfun_0.34             
##  [25] hexbin_1.28.2          crayon_1.5.2           RCurl_1.98-1.9        
##  [28] jsonlite_1.8.3         genefilter_1.78.0      survival_3.4-0        
##  [31] glue_1.6.2             gtable_0.3.1           gargle_1.2.1          
##  [34] zlibbioc_1.42.0        XVector_0.36.0         DelayedArray_0.22.0   
##  [37] scales_1.2.1           DBI_1.1.3              Rcpp_1.0.9            
##  [40] viridisLite_0.4.1      xtable_1.8-4           bit_4.0.4             
##  [43] preprocessCore_1.58.0  htmlwidgets_1.5.4      httr_1.4.4            
##  [46] RColorBrewer_1.1-3     ellipsis_0.3.2         farver_2.1.1          
##  [49] pkgconfig_2.0.3        XML_3.99-0.12          sass_0.4.2            
##  [52] dbplyr_2.2.1           deldir_1.0-6           locfit_1.5-9.6        
##  [55] utf8_1.2.2             labeling_0.4.2         tidyselect_1.2.0      
##  [58] rlang_1.0.6            AnnotationDbi_1.58.0   munsell_0.5.0         
##  [61] cellranger_1.1.0       tools_4.2.0            cachem_1.0.6          
##  [64] cli_3.4.1              generics_0.1.3         RSQLite_2.2.18        
##  [67] broom_1.0.1            evaluate_0.17          fastmap_1.1.0         
##  [70] yaml_2.3.6             knitr_1.40             bit64_4.0.5           
##  [73] fs_1.5.2               caTools_1.18.2         KEGGREST_1.36.3       
##  [76] brio_1.1.3             compiler_4.2.0         rstudioapi_0.14       
##  [79] png_0.1-7              testthat_3.1.5         affyio_1.66.0         
##  [82] reprex_2.0.2           geneplotter_1.74.0     bslib_0.4.1           
##  [85] stringi_1.7.8          highr_0.9              Matrix_1.5-1          
##  [88] vctrs_0.5.0            pillar_1.8.1           lifecycle_1.0.3       
##  [91] BiocManager_1.30.19    jquerylib_0.1.4        bitops_1.0-7          
##  [94] R6_2.5.1               latticeExtra_0.6-30    affy_1.74.0           
##  [97] KernSmooth_2.23-20     codetools_0.2-18       gtools_3.9.3          
## [100] assertthat_0.2.1       rprojroot_2.0.3        withr_2.5.0           
## [103] GenomeInfoDbData_1.2.8 hms_1.1.2              rmarkdown_2.17        
## [106] googledrive_2.0.0      lubridate_1.9.0        interp_1.1-3